# Restrict the daily vaccination data to Scotland-wide totals across all
# ages, keeping only rows where vaccinations have actually been recorded.
trend_vacc_hb <- daily_vacc_hb %>%
  filter(
    hb_name == "Scotland",
    sex == "Total",
    age_group == "All vaccinations",
    cumulative_number_vaccinated != 0
  )
# Plot to visualise the daily vaccination trend for Scotland, by dose.
plot_vaccine <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("Trend on Vaccination") +
  xlab("Year") +
  # y maps to number_vaccinated, so label it as vaccinations —
  # the previous "No of Positive Cases" label was wrong for this data.
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac"))
ggplotly(plot_vaccine)
# Plot to visualise the cumulative vaccination trend, by dose.
plot_vaccine_cumm <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = cumulative_number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  # Fixed spelling: title previously read "Cummulative".
  ggtitle("Cumulative Trend on Vaccination") +
  xlab("Year") +
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac")) +
  # Show the y axis in millions for readability.
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
ggplotly(plot_vaccine_cumm)
NA
Data Preparation
# Keep only the second-dose cumulative series for the forecasting exercise.
trend_vacc_hb <- trend_vacc_hb %>%
filter (dose == "Dose 2") %>%
select(date,cumulative_number_vaccinated)
# Convert it to a zoo object indexed by date.
# NOTE(review): format='%m/%d/%Y' assumes US-style date strings — confirm
# this matches the actual format of `date`. (If `date` is already a Date,
# as.Date() ignores the format argument and this is harmless.)
daily_vacc_hb_zoo <- zoo(trend_vacc_hb$cumulative_number_vaccinated,
order.by=as.Date(trend_vacc_hb$date, format='%m/%d/%Y'))
# Convert it into a timeSeries object for modelling.
daily_vacc_hb_timeseries <-timeSeries::as.timeSeries(daily_vacc_hb_zoo)
# Check whether it is a time series.
# NOTE(review): is.ts(ts(x)) is TRUE for any x, so this check is
# uninformative — is.ts(daily_vacc_hb_timeseries) would be the real test.
is.ts(ts(daily_vacc_hb_timeseries))
[1] TRUE
ARIMA MODEL
Step 1 : Visualise the time series
# Step 1: visualise the raw cumulative second-dose series.
# NOTE(review): the "Scale for 'y' is already present" warning below
# suggests color_theme() also adds a y scale that replaces the
# unit_format one — confirm the intended ordering of these two calls.
original_series<-autoplot(daily_vacc_hb_timeseries, colour = '#5ab4ac')+
xlab("Month") +
ylab("Number of People vaccinated")+
#scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Trend on Vaccination") +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
ggplotly(original_series)
Step 2 : Identification of model : (Finding d:)
To identify whether the time series is stationary or non-stationary, we can use the Augmented Dickey-Fuller (ADF) test.
# ADF test on the original series; H0 = non-stationary (unit root).
adf_test <- adf.test(daily_vacc_hb_timeseries)
The time series is not stationary, since the p-value is high, so we apply differencing.
# First difference and re-test; still non-stationary (p = 0.96 below).
first_diff_ts<- diff(daily_vacc_hb_timeseries)
adf_test1 <- adf.test(na.omit(first_diff_ts))
# Second difference; the ADF test now rejects the unit root (p <= 0.01).
second_diff_ts<- diff(first_diff_ts)
adf_test2 <- adf.test(na.omit(second_diff_ts))
Warning in adf.test(na.omit(second_diff_ts)) :
p-value smaller than printed p-value
adf_test1
Augmented Dickey-Fuller Test
data: na.omit(first_diff_ts)
Dickey-Fuller = -0.75615, Lag order = 6, p-value = 0.9647
alternative hypothesis: stationary
adf_test2
Augmented Dickey-Fuller Test
data: na.omit(second_diff_ts)
Dickey-Fuller = -6.3311, Lag order = 6, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
# Collect the ADF statistics and p-values into one comparison table.
adf_data <- data.frame(Data = c("Original", "First-Ordered", "Second Ordered"),
Dickey_Fuller = c(adf_test$statistic, adf_test1$statistic, adf_test2$statistic),
p_value = c(adf_test$p.value,adf_test1$p.value,adf_test2$p.value))
adf_data
Initially the p-value is high, which indicates that the time series is not stationary, so we difference it twice. After the second difference, the p-value is below the 0.05 significance level, so we conclude that the differenced data are stationary. Hence the order of differencing is d = 2.
# Cross-check: ndiffs() also estimates that d = 2 differences are needed.
ndiffs(daily_vacc_hb_timeseries)
[1] 2
Let’s plot the First Order and Second Order Difference Series
Order of first difference
# Plot the first-order differenced series.
first_order <- autoplot(first_diff_ts, ts.colour = '#5ab4ac') +
  ggtitle("First-Order Difference Series") +
  xlab("Month") +
  ylab("VACCINATED") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  color_theme()
ggplotly(first_order)
Order of Second difference
# Plot the second-order differenced series.
second_order <- autoplot(second_diff_ts, ts.colour = '#5ab4ac') +
  ggtitle("Second-Order Difference Series") +
  xlab("Month") +
  ylab("VACCINATED") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  color_theme()
ggplotly(second_order)
For our model ARIMA (p,d,q), we found d = 2, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
# 2x2 plotting grid; ACF of the first-differenced series.
par(mfrow=c(2,2))
acf1(first_diff_ts, gg=TRUE, col=2:7, lwd=4)
[1] 0.91 0.87 0.83 0.81 0.81 0.80 0.84 0.77 0.73 0.69 0.67 0.66 0.66 0.69 0.62 0.58 0.55 0.54 0.54 0.56 0.58 0.53 0.50
[24] 0.48 0.46 0.46 0.47
# ACF of the second-differenced series — note the recurring spikes at
# lags 7, 14 and 21 in the printed values (a weekly pattern).
acf1(second_diff_ts, gg=TRUE, col=2:7, lwd=4)
[1] -0.30 0.00 -0.11 -0.08 -0.03 -0.19 0.61 -0.18 -0.05 -0.06 -0.09 -0.04 -0.13 0.53 -0.17 -0.06 -0.12 -0.07 -0.08
[20] -0.05 0.47 -0.17 0.01 -0.09 -0.06 -0.07 -0.09
# PACF of the first-differenced series.
acf1(first_diff_ts, pacf=TRUE, gg=TRUE, col=2:7, lwd=4)
[1] 0.91 0.25 0.06 0.12 0.15 0.13 0.31 -0.47 -0.10 0.09 0.01 0.05 0.07 0.05 -0.25 -0.04 0.04 0.13 0.09
[20] 0.11 0.03 -0.13 0.03 -0.06 -0.06 -0.01 0.00
# PACF of the second-differenced series.
acf1(second_diff_ts, pacf=TRUE, gg=TRUE, col=2:7, lwd=4)
[1] -0.30 -0.10 -0.15 -0.18 -0.16 -0.36 0.48 0.12 -0.10 -0.02 -0.06 -0.08 -0.06 0.23 0.03 -0.05 -0.16 -0.12 -0.13
[20] -0.04 0.11 -0.05 0.06 0.07 0.02 0.01 -0.12
For the second-differenced series, the dominant sample ACF and PACF spikes are at lag 1 (the recurring spikes at lags 7, 14 and 21 suggest a weekly pattern that a non-seasonal model will not capture).
So we propose three ARMA models for the differenced data: ARMA(0,1), ARMA(1,0) and ARMA(1,1).
That is, for the original time series, we propose three ARIMA(p,d,q) models: ARIMA(0,2,1), ARIMA(1,2,0) and ARIMA(1,2,1).
4 (a) Manual method:
# Fit the candidate ARIMA models identified from the ACF/PACF plots,
# plus a higher-order ARIMA(3,2,2) for comparison.
# Use `<-` for assignment (idiomatic R) rather than `=`.
arima_fit1 <- Arima(daily_vacc_hb_timeseries, order = c(1, 2, 0))
arima_fit2 <- Arima(daily_vacc_hb_timeseries, order = c(0, 2, 1))
arima_fit3 <- Arima(daily_vacc_hb_timeseries, order = c(1, 2, 1))
arima_fit4 <- Arima(daily_vacc_hb_timeseries, order = c(3, 2, 2))
summary(arima_fit1)
Series: daily_vacc_hb_timeseries
ARIMA(1,2,0)
Coefficients:
ar1
-0.2945
s.e. 0.0567
sigma^2 estimated as 20328963: log likelihood=-2782.2
AIC=5568.41 AICc=5568.45 BIC=5575.7
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 10.24122 4484.972 2576.654 0.4096955 1.937803 0.1895856 -0.02928208
summary(arima_fit2)
Series: daily_vacc_hb_timeseries
ARIMA(0,2,1)
Coefficients:
ma1
-0.4289
s.e. 0.0738
sigma^2 estimated as 19743903: log likelihood=-2778.13
AIC=5560.26 AICc=5560.3 BIC=5567.55
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 13.72387 4419.963 2555.154 0.455319 1.891454 0.1880037 0.0650249
summary(arima_fit3)
Series: daily_vacc_hb_timeseries
ARIMA(1,2,1)
Coefficients:
ar1 ma1
0.3425 -0.7230
s.e. 0.0997 0.0706
sigma^2 estimated as 19180848: log likelihood=-2773.58
AIC=5553.15 AICc=5553.24 BIC=5564.09
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 18.59485 4348.752 2506.988 0.5190314 1.823004 0.1844597 -0.01064161
summary(arima_fit4)
Series: daily_vacc_hb_timeseries
ARIMA(3,2,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.8243 -0.4487 -0.4144 -1.2766 0.9213
s.e. 0.0598 0.0704 0.0566 0.0358 0.0312
sigma^2 estimated as 16167581: log likelihood=-2748.77
AIC=5509.53 AICc=5509.84 BIC=5531.4
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 13.84351 3971.207 2264.727 0.4282413 1.784024 0.1666346 -0.05914206
4 (b) Automated method:
# Automated model selection: search non-seasonal ARIMA models only.
auto_arima_fit <- auto.arima(daily_vacc_hb_timeseries, seasonal = FALSE)
summary(auto_arima_fit)
Series: daily_vacc_hb_timeseries
ARIMA(2,2,5)
Coefficients:
ar1 ar2 ma1 ma2 ma3 ma4 ma5
-0.4419 -0.9428 0.0995 0.9051 -0.5686 0.0351 -0.3973
s.e. 0.0263 0.0316 0.0601 0.0574 0.0643 0.0565 0.0488
sigma^2 estimated as 15627175: log likelihood=-2742.83
AIC=5501.66 AICc=5502.19 BIC=5530.82
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 18.03778 3890.204 2292.291 0.4877816 1.799259 0.1686627 0.004919773
# Plot the selected model object (for an Arima fit, forecast::autoplot
# shows the inverse AR/MA roots — presumably to check they lie inside
# the unit circle; confirm against the forecast package docs).
autoplot(auto_arima_fit)
We use the following measure to select the model
Akaike Information Criterion (AIC), Root Mean Squared Error (RMSE), and Mean Absolute Percentage Error (MAPE)
Based on that, An ARIMA(2, 2, 5) model seems best.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
# Residual diagnostics for the selected model. Reuse the existing fit
# rather than calling auto.arima() again: the original re-ran the search
# (with its default seasonal = TRUE) and could diagnose a different
# model than the one chosen above.
ggtsdiag(auto_arima_fit)
Other method:
checkresiduals(auto_arima_fit)
Ljung-Box test
data: Residuals from ARIMA(2,2,5)
Q* = 59.66, df = 3, p-value = 6.948e-13
Model df: 7. Total lags used: 10
The Ljung-Box portmanteau test on the residuals of the ARIMA(2,2,5) model returns a very small p-value (6.9e-13), so the hypothesis that the residuals are white noise is rejected: significant autocorrelation remains. The model may still produce usable point forecasts, but the prediction intervals could be inaccurate because of the remaining correlation in the residuals.
# Plot the series along with the in-sample fitted values
# (fitted = observed - residuals).
# NOTE(review): `arima_fit` here is a ts of fitted VALUES, not a model
# object — the name is easily confused with arima_fit1..4, and passing
# it to forecast() later would fit a brand-new model to those values.
arima_fit <- ts(daily_vacc_hb_timeseries) - resid(auto_arima_fit)
autoplot(ts(daily_vacc_hb_timeseries), colour = '#5ab4ac') +
autolayer(arima_fit) +
xlab("Month") +
ylab("Number of People vaccinated")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Fitting the ARIMA model with existing data") +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
8 Forecast/ Predict using the model
# Plot the vaccination series plus the forecast and 80%/95% prediction
# intervals.
# Forecast from the fitted ARIMA model itself. The original called
# forecast(arima_fit, ...), where `arima_fit` is the ts of fitted values
# — forecast() on a plain ts fits a brand-new model to those values
# instead of using the selected ARIMA(2,2,5) fit.
forecast_model <- forecast(auto_arima_fit, level = c(80, 95), h = 30)
# Label positions for the PAST/FUTURE annotations
# (x in observation index, y in number of people vaccinated).
annotation <- data.frame(
  x = c(50, 305),
  y = c(1000000, 3000000),
  label = c("PAST", "FUTURE")
)
# Plot the forecast with annotations marking past vs forecast horizon.
forecast_model %>%
  autoplot(colour = '#f1a340') +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("Forecast for 1 month") +
  xlab("Year") +
  ylab("No of people vaccinated") +
  # Single y scale in millions. The original added a scales::comma scale
  # that was immediately replaced by this one, triggering the
  # "Scale for 'y' is already present" warning.
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  color_theme() +
  geom_text(data = annotation,
            aes(x = x, y = y, label = label),
            color = "red",
            size = 4) +
  # Dashed line separating observed data from the forecast horizon.
  geom_vline(xintercept = 285, linetype = 2)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
EXTRA
Forecast the Manual ARIMA model
# Forecast 30 steps ahead from each manually specified ARIMA model.
# Use `<-` for assignment (idiomatic R) rather than `=`.
future  <- forecast(arima_fit1, h = 30)
future2 <- forecast(arima_fit2, h = 30)
future3 <- forecast(arima_fit3, h = 30)
future4 <- forecast(arima_fit4, h = 30)
# Plot the four forecasts in a 2x2 grid, restoring the previous
# graphics parameters afterwards so later plots are unaffected.
old_par <- par(mfrow = c(2, 2))
plot(future)
plot(future2)
plot(future3)
plot(future4)
par(old_par)
## Partition into train and test: hold out the last 10 observations
## (indices 141-150) for validation.
train_series <- daily_vacc_hb_timeseries[1:140]
test_series  <- daily_vacc_hb_timeseries[141:150]
## Fit the candidate models on the training portion only.
## Use `<-` for assignment (idiomatic R) rather than `=`.
arimaModel_1 <- arima(train_series, order = c(0, 2, 1))
arimaModel_2 <- arima(train_series, order = c(1, 2, 1))
arimaModel_3 <- arima(train_series, order = c(1, 2, 0))
## look at the parameters
# Inspect the ARIMA(0,2,1) fit (no trailing semicolon needed in R).
print(arimaModel_1)
Call:
arima(x = train_series, order = c(0, 2, 1))
Coefficients:
ma1
-0.4581
s.e. 0.1041
sigma^2 estimated as 32122914: log likelihood = -1388.6, aic = 2781.2
# Inspect the ARIMA(1,2,1) fit (no trailing semicolon needed in R).
print(arimaModel_2)
Call:
arima(x = train_series, order = c(1, 2, 1))
Coefficients:
ar1 ma1
0.3101 -0.7197
s.e. 0.1394 0.0981
sigma^2 estimated as 31237633: log likelihood = -1386.72, aic = 2779.44
print(arimaModel_3)
Call:
arima(x = train_series, order = c(1, 2, 0))
Coefficients:
ar1
-0.331
s.e. 0.080
sigma^2 estimated as 32981001: log likelihood = -1390.36, aic = 2784.72